suppressMessages(library(scater))
suppressMessages(library(ggplot2))
suppressMessages(library(ggpubr))
yg0 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_yg0.rds")
yg1 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_yg1.rds")
yg3 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_yg3.rds")
yg5 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_yg5.rds")
old0 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_old0.rds")
old1 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_old1.rds")
old3 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_old3.rds")
old5 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_old5.rds")
yg0_mat <- counts(yg0)
yg1_mat <- counts(yg1)
yg3_mat <- counts(yg3)
yg5_mat <- counts(yg5)
old0_mat <- counts(old0)
old1_mat <- counts(old1)
old3_mat <- counts(old3)
old5_mat <- counts(old5)
main_matrix <- cbind(as.matrix(yg0_mat),
as.matrix(yg1_mat),
as.matrix(yg3_mat),
as.matrix(yg5_mat),
as.matrix(old0_mat),
as.matrix(old1_mat),
as.matrix(old3_mat),
as.matrix(old5_mat))
keep_genes <- rowSums(main_matrix) > 0
table(keep_genes)
## keep_genes
## FALSE TRUE
## 5146 27139
main_matrix <- main_matrix[keep_genes,]
dim(main_matrix)
## [1] 27139 65006
sample <- c(rep("YG0",7505),
rep("YG1", 5724),
rep("YG3", 7284),
rep("YG5", 6111),
rep("OLD0", 8237),
rep("OLD1", 7596),
rep("OLD3", 5647),
rep("OLD5", 16902)
)
table(sample)
## sample
## OLD0 OLD1 OLD3 OLD5 YG0 YG1 YG3 YG5
## 8237 7596 5647 16902 7505 5724 7284 6111
sorting_day <- c(rep("day1",7505),
rep("day2", 5724),
rep("day1", 7284),
rep("day2", 6111),
rep("day1", 8237),
rep("day2", 7596),
rep("day1", 5647),
rep("day2", 16902)
)
table(sorting_day)
## sorting_day
## day1 day2
## 28673 36333
table(sorting_day, sample)
## sample
## sorting_day OLD0 OLD1 OLD3 OLD5 YG0 YG1 YG3 YG5
## day1 8237 0 5647 0 7505 0 7284 0
## day2 0 7596 0 16902 0 5724 0 6111
lane <- c(rep("lane1",7505),
rep("lane1", 5724),
rep("lane2", 7284),
rep("lane2", 6111),
rep("lane1", 8237),
rep("lane1", 7596),
rep("lane2", 5647),
rep("lane2", 16902)
)
table(lane)
## lane
## lane1 lane2
## 29062 35944
table(lane, sample)
## sample
## lane OLD0 OLD1 OLD3 OLD5 YG0 YG1 YG3 YG5
## lane1 8237 7596 0 0 7505 5724 0 0
## lane2 0 0 5647 16902 0 0 7284 6111
total_counts <- c(yg0$total_counts,
yg1$total_counts,
yg3$total_counts,
yg5$total_counts,
old0$total_counts,
old1$total_counts,
old3$total_counts,
old5$total_counts)
total_features <- c(yg0$total_features,
yg1$total_features,
yg3$total_features,
yg5$total_features,
old0$total_features,
old1$total_features,
old3$total_features,
old5$total_features)
pct_mt <- c(yg0$percentage_mt,
yg1$percentage_mt,
yg3$percentage_mt,
yg5$percentage_mt,
old0$percentage_mt,
old1$percentage_mt,
old3$percentage_mt,
old5$percentage_mt)
df_colData <- data.frame(total_counts <- total_counts,
total_features <- total_features,
pct_mt <- pct_mt,
log10_total_counts <- log10(total_counts+1),
log10_total_features <- log10(total_features +1),
log10_percentage_mt <- log10(pct_mt + 0.01))
colnames(df_colData) <- c("total_counts",
"total_features",
"percentage_mt",
"log10_total_counts",
"log10_total_features",
"log10_percentage_mt")
df_colData[1:5,]
summary(df_colData$total_counts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 500 2238 8042 9017 13408 74630
quantile(df_colData$total_counts, c(0.05,0.95))
## 5% 95%
## 746.00 22724.75
summary(df_colData$total_features)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 57 1092 2537 2424 3445 7963
quantile(df_colData$total_features, c(0.05,0.95))
## 5% 95%
## 441 4666
summary(df_colData$percentage_mt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.315 1.970 3.527 2.811 97.670
quantile(df_colData$percentage_mt, c(0.05,0.95))
## 5% 95%
## 0.3830922 8.8435374
summary(df_colData$log10_total_counts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.700 3.350 3.905 3.752 4.127 4.873
quantile(df_colData$log10_total_counts, c(0.05,0.95))
## 5% 95%
## 2.873321 4.356518
summary(df_colData$log10_total_features)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.763 3.039 3.404 3.282 3.537 3.901
quantile(df_colData$log10_total_features, c(0.05,0.95))
## 5% 95%
## 2.645422 3.669038
summary(df_colData$log10_percentage_mt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.0000 0.1222 0.2968 0.2816 0.4505 1.9898
quantile(df_colData$log10_percentage_mt, c(0.05,0.95))
## 5% 95%
## -0.4055058 0.9471168
gghistogram(data=df_colData,
x = "total_counts",
y = "..count..",
bins=(93),
alpha = .2,
color="black",
fill="red",
title="Library size",
ylab="#cells",
xlab="Total counts"
) +
theme_bw() +
theme(text = element_text(size=40))
gghistogram(data=df_colData,
x = "log10_total_counts",
y = "..count..",
bins=(94),
alpha = .2,
color="black",
fill="white",
title="Library size",
ylab="#cells",
xlab="log10(total counts)"
) +
theme_bw() +
theme(text = element_text(size=20)) + geom_vline(xintercept = 3.5, size = 1, col = "red") +
scale_x_continuous(breaks = seq(3, 4.5, by=0.5))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print("h")
## [1] "h"
table(df_colData$log10_total_counts >= 3.5)
##
## FALSE TRUE
## 19850 45156
gghistogram(data=df_colData,
x = "total_features",
y = "..count..",
bins=(94),
alpha = .2,
color="black",
fill="red",
title="Total features detected",
ylab="#cells",
xlab="Total features"
) +
theme_bw() +
theme(text = element_text(size=40))
gghistogram(data=df_colData,
x = "log10_total_features",
y = "..count..",
bins=(94),
alpha = .2,
color="black",
fill="white",
title="Number of detected features",
ylab="#cells",
xlab="log10(total features)"
) +
theme_bw() +
theme(text = element_text(size=20)) + geom_vline(xintercept = 3, col = "red", size = 1)
table(df_colData$log10_total_counts >= 3.2)
##
## FALSE TRUE
## 12331 52675
ggplot(df_colData,
aes(y=total_features, x=total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("") +
xlab("Total counts") + ylab("Unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank())
ggplot(df_colData,
aes(y=total_features, x=total_counts, color = sample)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("") +
xlab("Total counts") + ylab("Unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
scale_color_manual(values=c("blue","green", "grey", "red", "orange",
"yellow", "pink", "cyan"))
ggplot(df_colData,
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank())
ggplot(df_colData,
aes(y=log10_total_features, x=log10_total_counts, color = sample)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
scale_color_manual(values=c("blue","green", "grey", "red", "orange",
"yellow", "pink", "cyan"))
ggplot(df_colData,
aes(y=percentage_mt, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("") +
xlab("log10(total counts)") + ylab("% mt-genes")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_hline(yintercept = 10, col = "red")
ggplot(df_colData,
aes(y=percentage_mt, x=log10_total_counts,
color = log10(main_matrix["S100a8",]+1)
))+
geom_jitter(size=1, alpha=0.75) +
ggtitle("Coloured by S100a8 expression") +
xlab("log10 Total counts") + ylab("Percentage mt-genes")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
ylim(0,25) +
scale_colour_gradient(low = "grey", high = "red", na.value = NA)
## Warning: Removed 1511 rows containing missing values (`geom_point()`).
ggplot(df_colData,
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("") +
xlab("log10(total counts)") + ylab("log10(total features)")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_vline(xintercept = 3.5, col = "red") + geom_hline(yintercept = 3, col = "red")
ggplot(df_colData[sample == "YG0",],
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("Cutoff suggestion - YG0") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)
ggplot(df_colData[sample == "YG1",],
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("Cutoff suggestion - YG1") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)
ggplot(df_colData[sample == "YG3",],
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("Cutoff suggestion - YG3") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)
ggplot(df_colData[sample == "YG5",],
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("Cutoff suggestion - YG5") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)
ggplot(df_colData[sample == "OLD0",],
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("Cutoff suggestion - OLD0") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)
ggplot(df_colData[sample == "OLD1",],
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("Cutoff suggestion - OLD1") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)
ggplot(df_colData[sample == "OLD3",],
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("Cutoff suggestion - OLD3") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)
ggplot(df_colData[sample == "OLD5",],
aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("Cutoff suggestion - OLD5") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)
Keep cells with total counts >= 3.0 and total features >= 2.5
keep_cells <- df_colData$log10_total_counts >= 3.5 & df_colData$log10_total_features >= 3 &
df_colData$percentage_mt < 10
table(keep_cells)
## keep_cells
## FALSE TRUE
## 20158 44848
gghistogram(data=df_colData[keep_cells,],
x = "log10_total_counts",
y = "..count..",
bins=(94),
alpha = .2,
color="black",
fill="white",
title="Library size - after QC",
ylab="#cells",
xlab="log10(total counts)"
) +
theme_bw() +
theme(text = element_text(size=20))
gghistogram(data=df_colData[keep_cells,],
x = "log10_total_features",
y = "..count..",
bins=(94),
alpha = .2,
color="black",
fill="white",
title="Number of detected features - after QC",
ylab="#cells",
xlab="log10(total features)"
) +
theme_bw() +
theme(text = element_text(size=20))
ggplot(df_colData[keep_cells,],
aes(y=log10_total_features, x=log10_total_counts,
color = (log10(main_matrix["S100a8",]+1)[keep_cells]))) +
geom_jitter(size=1, alpha=0.75) +
ggtitle("After QC - coloured by S100a8 expression") +
xlab("log10 Total counts") + ylab("log10 unique features detected")+
theme_bw() +
theme(text = element_text(size=30), legend.title=element_blank()) +
scale_colour_gradient(low = "grey", high = "red", na.value = NA)
table(keep_cells, sample)
## sample
## keep_cells OLD0 OLD1 OLD3 OLD5 YG0 YG1 YG3 YG5
## FALSE 1328 1425 585 12610 1436 946 1186 642
## TRUE 6909 6171 5062 4292 6069 4778 6098 5469
table(sample)
## sample
## OLD0 OLD1 OLD3 OLD5 YG0 YG1 YG3 YG5
## 8237 7596 5647 16902 7505 5724 7284 6111
pct_excluded <- as.matrix(table(keep_cells, sample))
pct_excluded
## sample
## keep_cells OLD0 OLD1 OLD3 OLD5 YG0 YG1 YG3 YG5
## FALSE 1328 1425 585 12610 1436 946 1186 642
## TRUE 6909 6171 5062 4292 6069 4778 6098 5469
pct_excluded[1,]/as.vector(table(sample))*100
## OLD0 OLD1 OLD3 OLD5 YG0 YG1 YG3 YG5
## 16.12237 18.75987 10.35948 74.60656 19.13391 16.52690 16.28226 10.50565
pct_excluded[2,]/as.vector(table(sample))*100
## OLD0 OLD1 OLD3 OLD5 YG0 YG1 YG3 YG5
## 83.87763 81.24013 89.64052 25.39344 80.86609 83.47310 83.71774 89.49435
main_matrix <- main_matrix[,keep_cells]
dim(main_matrix)
## [1] 27139 44848
sce <- SingleCellExperiment(assays = list(counts = as.matrix(main_matrix)))
rowData(sce)$feature_ids <- yg1$feature.ids[keep_genes]
genes_to_keep <- apply(
counts(sce),
1,
function(x) length(x[x > 1]) >= 1 )
table(genes_to_keep)
## genes_to_keep
## FALSE TRUE
## 5796 21343
sce <- sce[genes_to_keep,]
sce
## class: SingleCellExperiment
## dim: 21343 44848
## metadata(0):
## assays(1): counts
## rownames(21343): Xkr4 Rp1 ... CAAA01147332.1 AC149090.1
## rowData names(0):
## colnames(44848): AAACCCAAGCCTTTCC-1 AAACCCAAGTCTTCCC-1 ...
## TTTGTTGTCGCTTACC-1 TTTGTTGTCTCTAAGG-1
## colData names(0):
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
grep("mt-", rownames(sce), value = TRUE)
## [1] "mt-Nd1" "mt-Nd2" "mt-Co1" "mt-Co2" "mt-Atp8" "mt-Atp6" "mt-Co3"
## [8] "mt-Nd3" "mt-Nd4l" "mt-Nd4" "mt-Nd5" "mt-Nd6" "mt-Cytb"
mt_genes <- grep("mt-", rownames(sce), value = TRUE)
QC_metrics <- perCellQCMetrics(sce, exprs_values="counts", subsets=list(MT = mt_genes))
QC_metrics[1:5,]
## DataFrame with 5 rows and 6 columns
## sum detected subsets_MT_sum subsets_MT_detected
## <numeric> <numeric> <numeric> <numeric>
## AAACCCAAGCCTTTCC-1 9103 2857 323 13
## AAACCCAAGTCTTCCC-1 17564 4075 329 12
## AAACCCACAGACACAG-1 12299 3433 363 12
## AAACCCAGTACCTATG-1 7512 2515 161 11
## AAACCCAGTCACTTCC-1 34858 5044 790 13
## subsets_MT_percent total
## <numeric> <numeric>
## AAACCCAAGCCTTTCC-1 3.54828 9103
## AAACCCAAGTCTTCCC-1 1.87315 17564
## AAACCCACAGACACAG-1 2.95146 12299
## AAACCCAGTACCTATG-1 2.14324 7512
## AAACCCAGTCACTTCC-1 2.26634 34858
sce$total_counts <- QC_metrics$sum
sce$log10_total_counts <- log10(QC_metrics$sum+1)
sce$log10_total_features <- log10(QC_metrics$detected+1)
sce$total_features <- QC_metrics$detected
sce$pct_mt <- QC_metrics$subsets_MT_percent
sce$log10_pct_mt <- log10(QC_metrics$subsets_MT_percent+0.01)
sce$sample <- sample[keep_cells]
sce$sorting_day <- sorting_day[keep_cells]
sce$lane <- lane[keep_cells]
sce
## class: SingleCellExperiment
## dim: 21343 44848
## metadata(0):
## assays(1): counts
## rownames(21343): Xkr4 Rp1 ... CAAA01147332.1 AC149090.1
## rowData names(0):
## colnames(44848): AAACCCAAGCCTTTCC-1 AAACCCAAGTCTTCCC-1 ...
## TTTGTTGTCGCTTACC-1 TTTGTTGTCTCTAAGG-1
## colData names(9): total_counts log10_total_counts ... sorting_day lane
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
#saveRDS(sce, "/mnt/nmorais-nfs/marta/pB_joana-data/pC_data/sce_all-samples-after-qc.rds")